An insurance company has approached you with a dataset of previous claims of their clients. The insurance company wants you to develop a model to help them predict which claims look fraudulent. By doing so you hope to save the company millions of dollars annually.
Claim related fraud is a huge problem in the insurance industry. It is quite complex and difficult to identify those unwanted claims. With Random Forest Non-Parametric Machine Learning Algorithm, I am trying to troubleshoot and help the General Insurance industry with this problem.
The data that I have is from automobile insurance. I will be creating a predictive model that predicts whether an insurance claim is fraudulent or not. The answer, YES or NO, makes this a binary classification task. A comparison study has been performed to understand which ML algorithm suits the dataset best.
# --- Environment setup and raw-data load ---
import os
os.getcwd()  # show current working directory (notebook-style inspection)
#Importing required libraries
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import sklearn.metrics
from pylab import rcParams
%matplotlib inline
# Widen pandas display limits so wide frames print fully in the notebook.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# pandas version 0.24 or upper is required
pd.__version__
#load & view raw data
# NOTE(review): hard-coded Windows path — adjust per machine.
df = pd.read_csv('F:/insurance_claims.csv')
df.head(10)
df.dtypes
df.columns
df.shape
df.nunique()  # per-column cardinality (helps spot ID-like columns)
# Distribution of client age, then raw counts of the fraud label.
plt.style.use('fivethirtyeight')
# Bug fix: seaborn.distplot was deprecated and has been removed (seaborn
# 0.14); histplot with stat='density' and a KDE overlay reproduces the
# same density view of ages.
ax = sns.histplot(df.age, bins=np.arange(19, 64, 5), stat='density', kde=True)
ax.set_ylabel('Density')
ax.set_xlabel('Age')
plt.show()
plt.style.use('fivethirtyeight')
ax = sns.countplot(x='fraud_reported', data=df, hue='fraud_reported')
ax.set_xlabel('Fraud Reported')
ax.set_ylabel('Fraud Count')
plt.show()
As the plot above shows, the label distribution is skewed — as in most fraud datasets.
df['fraud_reported'].value_counts() # Count number of frauds vs non-frauds
df['incident_state'].value_counts() # records per state (feeds the map below)
Here we see that almost 25% fraud reported. Let’s try to look for an indicative variable. Let’s analyze location. This dataset only has information from the mid-Atlantic states from the USA.
# Bar chart: number of claim records per incident state.
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(10, 6))
state_counts = df.groupby('incident_state').fraud_reported.count()
ax = state_counts.plot.bar(ylim=0)
ax.set_xlabel('Incident State')
ax.set_ylabel('Fraud Reported')
plt.show()
# Build a per-row incident-count column for the choropleth below.
# The hard-coded numbers mirror df['incident_state'].value_counts() above.
# Bug fix: the original looped over every row and matched on a *positional*
# column index (df.iloc[i, 40]) — fragile if any column is added/removed —
# so a dict + vectorised map replaces it.
state_counts = {
    "NY": 262,
    "SC": 248,
    "WV": 217,
    "VA": 110,
    "NC": 110,
    "PA": 30,
    "OH": 23,
}
# fillna keeps the original state string for any code outside the mapping,
# matching the old loop's leave-unchanged behaviour.
df['incident_state_count'] = (
    df['incident_state'].map(state_counts).fillna(df['incident_state'])
)
# Choropleth of incident counts across the mid-Atlantic states (plotly).
from plotly.offline import plot
import plotly.graph_objs as go
# locations are two-letter state codes; z is the per-state incident count
# column built above.
data = [go.Choropleth(autocolorscale = True, locations = df['incident_state'],
z = df['incident_state_count'],
locationmode = 'USA-states',
marker = go.choropleth.Marker(line = go.choropleth.marker.Line(color = 'rgb(255,255,255)', width = 2)),
colorbar = go.choropleth.ColorBar(title = "Number of Incidents"))]
layout = go.Layout(
title = go.layout.Title(
text = 'Insurance Incident Claims on the Mid-Atlantic'
),
geo = go.layout.Geo(
scope = 'usa',
projection = go.layout.geo.Projection(type = 'albers usa'),
showlakes = True,
lakecolor = 'rgb(255, 255, 255)'),
)
fig = go.Figure(data = data, layout = layout)
#plot(fig, filename = 'd3-cloropleth-map') # for showing in a separate tab
fig.show()
# Stacked bar: fraud share per age (each age's row normalised to sum to 1).
plt.rcParams['figure.figsize'] = [15, 8]
ax= plt.style.use('fivethirtyeight')
table=pd.crosstab(df.age, df.fraud_reported)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Age vs Fraud Reported', fontsize=12)
plt.xlabel('Age')
plt.ylabel('Fraud Reported')
plt.show()
From the plot above, it is obvious that age is an important predictor of fraud. Ages 19-23 show a substantial number of fraud reports.
# Claim counts per incident date (all incidents fall in Jan-Feb 2015).
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(18, 8))
daily_counts = df.groupby('incident_date').total_claim_amount.count()
ax = daily_counts.plot.bar(ylim=0)
ax.set_xlabel('Incident Date')
ax.set_ylabel('Claim amount ($)')
plt.show()
We see that all the cases in the plot above fall within January and February 2015.
# --- Counts and fraud shares by policy state, incident type/state, education ---
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(10,6))
ax = df.groupby('policy_state').fraud_reported.count().plot.bar(ylim=0)
ax.set_ylabel('Fraud Reported')
ax.set_xlabel('Policy State')
plt.show()
# Fraud share per policy state (rows normalised to 1).
plt.rcParams['figure.figsize'] = [10, 6]
plt.style.use('fivethirtyeight')  # returns None — no point assigning it to ax
table=pd.crosstab(df.policy_state, df.fraud_reported)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Policy State vs Fraud Reported', fontsize=12)
plt.xlabel('Policy State')
plt.ylabel('Fraud Reported')
plt.show()
# Record counts per incident type.
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(10,6))
ax = df.groupby('incident_type').fraud_reported.count().plot.bar(ylim=0)
ax.set_xticklabels(ax.get_xticklabels(), rotation=20, ha="right")
ax.set_ylabel('Fraud Reported')
ax.set_xlabel('Incident Type')
plt.show()
# Record counts per incident state / education level.
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(10,6))
ax = sns.countplot(x='incident_state', data=df)
# Bug fix: this is a plain record count, not a fraud count — label accordingly.
ax.set_ylabel('Incident Count')
ax.set_xlabel('Incident State')
fig = plt.figure(figsize=(10,6))
ax = sns.countplot(y = 'insured_education_level', data=df)
# Bug fix: with y=categorical the count sits on the x axis and the category
# on the y axis; the original labels were swapped and named the wrong quantity.
ax.set_xlabel('Count of Insured')
ax.set_ylabel('Insured Education Level')
plt.show()
# # Breakdown of Average Vehicle claim by insured's education level, grouped by fraud reported
fig = plt.figure(figsize=(16,10))
# NOTE(review): ci=None is deprecated in seaborn >= 0.12 (use errorbar=None);
# kept for compatibility with the older stack this notebook targets.
ax = sns.catplot(x='fraud_reported', y='policy_annual_premium',hue='insured_education_level', data=df,
kind="bar", ci=None, palette="muted",height=6, legend=True, aspect=1.2)
ax.set_axis_labels("Fraud Reported", "Policy Annual Premium")
plt.show()
# Fraud share by education level (rows normalised to 1).
plt.rcParams['figure.figsize'] = [14, 6]
table=pd.crosstab(df.insured_education_level, df.fraud_reported)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of insured education vs Fraud reported', fontsize=12)
plt.xlabel('Insured Education Level')
plt.ylabel('Fraud Reported')
# Gender split as a pie chart.
# NOTE(review): labels are hard-coded and assume value_counts() returns
# Male first — verify against the data before trusting the legend.
plt.rcParams['figure.figsize'] = [6, 6]
ax = (df['insured_sex'].value_counts()*100.0 /len(df))\
.plot.pie(autopct='%.1f%%', labels = ['Male', 'Female'], fontsize=12)
ax.set_title('% Gender')
plt.ylabel('Insured Sex')
plt.show()
# Fraud share by gender.
plt.rcParams['figure.figsize'] = [11, 6]
table=pd.crosstab(df.insured_sex, df.fraud_reported)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of insured_sex vs Fraud', fontsize=12)
plt.xlabel('Insured Sex')
plt.ylabel('Fraud Reported')
plt.show()
# Relationship split as a pie chart (same hard-coded-label caveat as the
# gender pie in this cell).
plt.rcParams['figure.figsize'] = [6, 6]
ax = (df['insured_relationship'].value_counts()*100.0 /len(df))\
.plot.pie(autopct='%.1f%%', labels = ['husband', 'wife', 'own-child', 'unmarried', 'other-relative', 'not-in-family'],
fontsize=12)
ax.set_title('% Relationship')
plt.ylabel('Insured Relationship')
plt.show()
# Fraud share by relationship.
table=pd.crosstab(df.insured_relationship, df.fraud_reported)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of insured_relationship vs Fraud', fontsize=12)
plt.xlabel('Insured Relationship')
plt.ylabel('Fraud Reported')
plt.show()
# Pie charts of incident type, authorities contacted and incident severity,
# plus count plots of auto make and insured hobbies.
# NOTE(review): pie labels are hard-coded and assume a fixed value_counts()
# ordering — verify against the data before trusting the legends.
fig = plt.figure(figsize=(6,6))
ax = (df['incident_type'].value_counts()*100.0 /len(df))\
.plot.pie(autopct='%.1f%%', labels = ['Parked Car', 'Single Vehile Collision', 'Multi-vehicle Collision', 'Vehicle Theft'],
fontsize=12)
plt.ylabel('Incident Type')
fig = plt.figure(figsize=(6,6))
ax = (df['authorities_contacted'].value_counts()*100.0 /len(df))\
.plot.pie(autopct='%.1f%%', labels = ['Police', 'Fire', 'Other', 'None', 'Ambulance'],
fontsize=12)
plt.ylabel('Authorities Contacted')
fig = plt.figure(figsize=(12,6))
ax = sns.countplot(x='auto_make', data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.xlabel('Auto Make')
plt.ylabel('Auto Count')
plt.show()
fig = plt.figure(figsize=(6,6))
ax = (df['incident_severity'].value_counts()*100.0 /len(df))\
.plot.pie(autopct='%.1f%%', labels = ['Major Damage', 'Total Loss', 'Minor Damage', 'Trivial Damage'],
fontsize=12)
plt.ylabel('Incident Severity')
fig = plt.figure(figsize=(10,6))
ax = sns.countplot(x='insured_hobbies', data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.xlabel('Insured Hobbies')
plt.ylabel('Count of Insured')
plt.show()
df["insured_occupation"].value_counts()
# Claim-record counts per auto make and per hobby.
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(10,6))
ax= df.groupby('auto_make').vehicle_claim.count().plot.bar(ylim=0)
ax.set_ylabel('Vehicle Claim')
ax.set_xlabel('Auto Make')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(10,6))
ax= df.groupby('insured_hobbies').total_claim_amount.count().plot.bar(ylim=0)
ax.set_ylabel('Total Claim Amount')
ax.set_xlabel('Insured Hobbies')
ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
plt.show()
Cleaning up the data and preparing it for the machine learning model.
# Encode the target: 'Y' -> 1, 'N' -> 0 (one dict-based replace instead of
# two separate in-place calls).
df['fraud_reported'] = df['fraud_reported'].replace({'Y': 1, 'N': 0})
df.head()
# Zip codes are identifiers, not quantities — cast to object so describe()
# keeps them out of the numeric summary.
df[['insured_zip']] = df[['insured_zip']].astype(object)
df.describe()
Some variables, such as 'policy_bind_date', 'incident_date', 'incident_location' and 'insured_zip', contain a very high number of levels. We will remove these columns for our purposes.
Let's view summary of all the column with the object data-type :
# Summary statistics for every column, object-typed ones included.
df.describe(include='all')
Some values in the table are shown here as “NaN”. We will see how to deal with these missing values.
# Fraud share within each policy CSL level (rows normalised to 1).
plt.style.use('fivethirtyeight')
plt.figure(figsize=(14, 6))
table = pd.crosstab(df.policy_csl, df.fraud_reported)
normalised = table.div(table.sum(1).astype(float), axis=0)
normalised.plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Policy Csl vs Fraud', fontsize=12)
plt.xlabel('Policy Csl')
plt.ylabel('Fraud Reported')
plt.show()
policy_csl looks like an unavoidable predictor.
# Split the 'a/b' policy_csl string into its two numeric-string components.
df['csl_per_person'] = df.policy_csl.str.split('/', expand=True)[0]
df['csl_per_accident'] = df.policy_csl.str.split('/', expand=True)[1]
df['csl_per_person'].head()
df['csl_per_accident'].head()
df.auto_year.value_counts() # check the spread of years to decide on further action.
auto_year has 21 levels, and the number of records for each level is quite significant considering the dataset is not large. We will do some feature engineering with this variable: the year of manufacture indicates the age of the vehicle and may carry valuable information where insurance premium or fraud is concerned.
df['vehicle_age'] = 2018 - df['auto_year'] # Deriving the age of the vehicle based on the year value
# NOTE(review): 2018 is a hard-coded reference year — confirm it matches the
# intended "as of" date for vehicle age.
df['vehicle_age'].head(10)
bins = [-1, 3, 6, 9, 12, 17, 20, 24] # Factorize according to the time period of the day.
# The -1 lower edge keeps hour 0 inside the first bin; 7 labels for 7 bins.
names = ["past_midnight", "early_morning", "morning", 'fore-noon', 'afternoon', 'evening', 'night']
df['incident_period_of_day'] = pd.cut(df.incident_hour_of_the_day, bins, labels=names).astype(object)
df[['incident_hour_of_the_day', 'incident_period_of_day']].head(20)
# Check on categorical variables:
df.select_dtypes(include=['object']).columns # checking categorical columns
# dropping unimportant columns (IDs, raw dates/locations, and columns already
# replaced by the engineered features above)
df = df.drop(columns = [
'policy_number',
'policy_csl',
'insured_zip',
'policy_bind_date',
'incident_date',
'incident_location',
'_c39',
'auto_year',
'incident_hour_of_the_day'])
df.head(2)
# identify variables with '?' values ('?' is this dataset's missing marker)
unknowns = {}
for i in list(df.columns):
    if (df[i]).dtype == object:
        j = np.sum(df[i] == "?")
        unknowns[i] = j
unknowns = pd.DataFrame.from_dict(unknowns, orient = 'index')
print(unknowns)
collision_type, property_damage, police_report_available contain many missing values. So, first isolate these variables, inspect these individually for spread of category values.
# Inspect the three columns that contain '?' values, one at a time.
df.collision_type.value_counts()
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(10,6))
ax= df.groupby('collision_type').police_report_available.count().plot.bar(ylim=0)
ax.set_ylabel('Police Report')
ax.set_xlabel('Collision Type')
ax.set_xticklabels(ax.get_xticklabels(), rotation=10, ha="right")
plt.show()
df.property_damage.value_counts()
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(10,6))
ax= df.groupby('property_damage').police_report_available.count().plot.bar(ylim=0)
ax.set_ylabel('Police Report')
ax.set_xlabel('Property Damage')
ax.set_xticklabels(ax.get_xticklabels(), rotation=10, ha="right")
plt.show()
df.police_report_available.value_counts()
df.columns
df._get_numeric_data().head() # Checking numeric columns
df._get_numeric_data().columns
df.select_dtypes(include=['object']).columns # checking categorical columns
Applying one-hot encoding to convert all categorical variables except our target variable and the columns handled separately below:
'collision_type', 'property_damage', 'police_report_available', 'fraud_reported'.
# One-hot encode the nominal categorical predictors. The three '?'-containing
# columns and the target are appended unencoded and handled separately below.
dummies = pd.get_dummies(df[[
'policy_state',
'insured_sex',
'insured_education_level',
'insured_occupation',
'insured_hobbies',
'insured_relationship',
'incident_type',
'incident_severity',
'authorities_contacted',
'incident_state',
'incident_city',
'auto_make',
'auto_model',
'csl_per_person',
'csl_per_accident',
'incident_period_of_day']])
dummies = dummies.join(df[[
'collision_type',
'property_damage',
'police_report_available',
"fraud_reported"]])
dummies.head()
# fraud_reported was joined last, so it sits in the final column.
X = dummies.iloc[:, 0:-1] # predictor variables
y = dummies.iloc[:, -1] # target variable
len(X.columns)
X.head(2)
y.head()
from sklearn.preprocessing import LabelEncoder
# Label-encode collision_type; LabelEncoder sorts categories, so '?' becomes
# class 0.
X['collision_en'] = LabelEncoder().fit_transform(dummies['collision_type'])
X[['collision_type', 'collision_en']]
# Binary-encode the two yes/no columns, folding the '?' (missing) marker
# into 0/NO exactly as before. Bug fix: a single map() assignment per column
# replaces the chained Series.replace(..., inplace=True) calls, which are
# deprecated and unreliable (chained assignment) in modern pandas.
yes_no = {'YES': 1, 'NO': 0, '?': 0}
X['property_damage'] = X['property_damage'].map(yes_no)
X['police_report_available'] = X['police_report_available'].map(yes_no)
X.head(10)
# The raw string column is no longer needed once encoded.
X = X.drop(columns = ['collision_type'])
X.head(2)
X = pd.concat([X, df._get_numeric_data()], axis=1) # joining numeric columns
X.head(2)
X.columns
# The numeric join re-introduced the target — drop it from the predictors.
X = X.drop(columns = ['fraud_reported'])
X.columns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearDiscriminantAnalysis()
# Bug fix: random_state only takes effect when shuffle=True; modern
# scikit-learn raises a ValueError for a seeded, unshuffled KFold.
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
print(result.mean())
# Mean score with a ~95% confidence interval (2 standard deviations).
print("Accuracy: %0.2f (+/- %0.2f)" % (result.mean(), result.std() * 2))
84 % cross validation score without standardizing the data. Above is the mean score and the 95% confidence interval of the score estimate. This looks good to go for other Classification methods.
Creating a Training Set for the Data Set
# Hold out 20% of the data for final evaluation.
# NOTE(review): the label is imbalanced (~25% fraud) — consider stratify=y.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=7)
print('length of X_train and X_test: ', len(X_train), len(X_test))
print('length of y_train and y_test: ', len(y_train), len(y_test))
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report, cohen_kappa_score
from sklearn import metrics
# Baseline Random forest based Model
rfc = RandomForestClassifier(n_estimators=200)
# Bug fix: shuffle=True is required when random_state is set; modern
# scikit-learn raises a ValueError otherwise.
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
result2 = cross_val_score(rfc, X_train, y_train, cv=kfold, scoring='accuracy')
print(result2.mean())
# Generate a Histogram plot for anomaly detection
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [15, 8]
df.plot(kind='hist')  # overlaid histograms of every numeric column
plt.show()
# Box plots to eyeball outliers in a few numeric predictors.
plt.rcParams['figure.figsize'] = [5, 5]
sns.boxplot(x=X.policy_annual_premium)
plt.xlabel('Policy Annual Premium')
plt.show()
plt.rcParams['figure.figsize'] = [5, 5]
sns.boxplot(x=X.witnesses)
plt.xlabel('Witnesses')
plt.show()
plt.rcParams['figure.figsize'] = [5, 5]
sns.boxplot(x=X.vehicle_age)
plt.xlabel('Vehicle Age')
plt.show()
from sklearn.preprocessing import StandardScaler
# with_mean=False scales by standard deviation only — presumably to avoid
# centering the many 0/1 dummy columns; confirm this is intended.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # test set uses the train-set fit (no leakage)
X_train_scaled
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns) # retaining columns names
X_train_scaled.head(2)
# Generate a Histogram plot on scaled data to check anomalies
plt.rcParams['figure.figsize'] = [15, 8]
X_train_scaled.plot(kind='hist')
x_train_scaled = pd.DataFrame.to_numpy(X_train_scaled) # converting to array for computational ease
x_train_scaled
# Instantiate the candidate classifiers (library defaults unless noted).
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
xgb = XGBClassifier()
logreg= LogisticRegressionCV(solver='lbfgs', cv=10)  # CV over regularisation strengths
knn = KNeighborsClassifier(5)  # 5 nearest neighbours
svcl = SVC()
adb = AdaBoostClassifier()
dt = DecisionTreeClassifier(max_depth=5)
rf = RandomForestClassifier()
lda = LinearDiscriminantAnalysis()
gnb = GaussianNB()
# Cross-validated accuracy comparison of nine classifiers on the scaled
# training data.
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegressionCV(solver='lbfgs', max_iter=5000, cv=10)))
models.append(('XGB', XGBClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('SVM', SVC(gamma='auto')))
models.append(('RF', RandomForestClassifier(n_estimators=200)))
models.append(('ADA', AdaBoostClassifier(n_estimators=200)))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('GNB', GaussianNB()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    # Bug fix: shuffle=True is required when random_state is set; modern
    # scikit-learn rejects a seeded, unshuffled KFold.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, x_train_scaled, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison
plt.rcParams['figure.figsize'] = [15, 8]
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Hard-majority-vote ensemble of the two strongest models.
clf1= LogisticRegressionCV(solver='lbfgs', max_iter=5000, cv=10)
clf2 = XGBClassifier()
clf = [
('LR', clf1),
('XGB', clf2)]
#create our voting classifier, inputting our models
eclf= VotingClassifier(estimators=[
('LR', clf1),
('XGB', clf2)], voting='hard')
for clf, label in zip([clf1, clf2, eclf], [
        'Logistic Regression',
        'XGB Classifier',
        'Ensemble']):
    scores = cross_val_score(clf, x_train_scaled, y_train, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
from numpy import sort
from sklearn.feature_selection import SelectFromModel
# fit model on all training data
xgb = XGBClassifier()
xgb.fit(x_train_scaled, y_train)
# make predictions for test data and evaluate.
# Cleanup: xgb.predict already returns 0/1 class labels, so the old
# "predictions = [round(value) ...]" list was dead code (accuracy_score
# was always called on xgb_pred directly) and has been removed.
xgb_pred = xgb.predict(X_test_scaled)
accuracy = accuracy_score(y_test, xgb_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Fit model using each importance as a threshold: retrain on progressively
# smaller feature subsets and report accuracy for each.
thresholds = sort(xgb.feature_importances_)
for thresh in thresholds:
    # select features using threshold (prefit=True reuses the fitted model)
    selection = SelectFromModel(xgb, threshold=thresh, prefit=True)
    select_X_train = selection.transform(x_train_scaled)
    # train model on the reduced feature set
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model on the equally-reduced test set
    select_X_test = selection.transform(X_test_scaled)
    xgb_pred = selection_model.predict(select_X_test)
    accuracy = accuracy_score(y_test, xgb_pred)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))
# Plot XGBoost feature importances with real column names, then dump the
# model's hyper-parameters.
from xgboost import plot_importance
x = XGBClassifier()
x.fit(X_train_scaled, y_train) # fitting the model again on dataframe to identify the feature names
plt.rcParams['figure.figsize'] = [25, 20]
# plot feature importance
plot_importance(x)
from pprint import pprint
# Check parameters used
print('Parameters currently in use:\n')
pprint(x.get_params())
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import matplotlib
# NOTE(review): switching to the non-interactive 'Agg' backend here disables
# inline rendering of subsequent plots — confirm this is intended.
matplotlib.use('Agg')
from matplotlib import pyplot
plt.rcParams['figure.figsize'] = [10, 6]
# grid search over tree depth only
max_depth = range(1, 11, 2)
print(max_depth)
param_grid = dict(max_depth=max_depth)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# Bug fix: the 'iid' parameter was deprecated in scikit-learn 0.22 and
# removed in 0.24, so it is no longer passed.
grid_search = GridSearchCV(xgb, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold, verbose=1)
grid_result = grid_search.fit(x_train_scaled, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot mean log loss (with std error bars) against depth
pyplot.errorbar(max_depth, means, yerr=stds)
pyplot.title("XGBoost max_depth vs Log Loss")
pyplot.xlabel('max_depth')
pyplot.ylabel('Log Loss')
import numpy
# Joint grid search over n_estimators and max_depth.
n_estimators = [50, 100, 150, 200]
max_depth = [2, 4, 6, 8]
print(max_depth)
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# Bug fix: the 'iid' parameter was removed in scikit-learn 0.24, so it is
# no longer passed.
grid_search = GridSearchCV(xgb, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold, verbose=1)
grid_result = grid_search.fit(x_train_scaled, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot results: one log-loss curve per tree depth
scores = numpy.array(means).reshape(len(max_depth), len(n_estimators))
for i, value in enumerate(max_depth):
    pyplot.plot(n_estimators, scores[i], label='depth: ' + str(value))
pyplot.legend()
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
# Cross-validated Brier loss for a seeded XGBoost model (reuses the
# StratifiedKFold built in the previous cell).
xgb = XGBClassifier(objective='binary:logistic', random_state=7, n_jobs=-1)
xgb.fit(x_train_scaled, y_train)
# Bug fix: the 'brier_score_loss' scorer name was removed from scikit-learn;
# 'neg_brier_score' replaces it. Scores come back negated, hence the *-1
# below to report the positive loss.
scores = cross_val_score(xgb, x_train_scaled, y_train, cv=kfold, scoring='neg_brier_score')
print('Brier loss:', "{0:.5f}".format(np.mean(scores)*-1))
print(xgb.get_params())
from sklearn.model_selection import RandomizedSearchCV
# Create the parameter grid
params = {
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=500, num=9)],
    'max_depth': [i for i in range(3, 10)],
    'min_child_weight': [i for i in range(1, 7)],
    'subsample': [i/10.0 for i in range(6,11)],
    'colsample_bytree': [i/10.0 for i in range(6,11)]
}
# Create the randomised grid search model
# "n_iter = number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution"
# Bug fix: the 'brier_score_loss' scorer name was removed from scikit-learn;
# 'neg_brier_score' is its replacement (closer to 0 is better).
rgs = RandomizedSearchCV(estimator=xgb, param_distributions=params, n_iter=200, cv=kfold,
                         random_state=7, n_jobs=-1,
                         scoring='neg_brier_score', return_train_score=True)
# Fit rgs
rgs.fit(x_train_scaled, y_train)
# Print results
print(rgs)
best_score = rgs.best_score_
best_params = rgs.best_params_
print("Best score: {}".format(best_score))
print("Best params: ")
for param_name in sorted(best_params.keys()):
    print('%s: %r' % (param_name, best_params[param_name]))
# make predictions on the held-out 20% and report several metrics
rgs_pred = rgs.predict(X_test_scaled)
print('Accuracy: ', round(accuracy_score(y_test, rgs_pred)*100, 2))
print( 'Cohen Kappa: '+ str(np.round(cohen_kappa_score(y_test, rgs_pred),3)))
print('Recall: ', round(recall_score(y_test, rgs_pred)*100, 2))
print('\n Classification Report:\n', classification_report(y_test, rgs_pred))
print(result.mean())  # NOTE(review): re-prints the earlier LDA CV mean (stray cell output)
# Final cross-validated check of a default XGBoost model, then a test-set score.
xgb = XGBClassifier()
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('XGB', XGBClassifier()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    # Bug fix: shuffle=True is required when random_state is set; modern
    # scikit-learn rejects a seeded, unshuffled KFold.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, x_train_scaled, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Fit the model on the full training set
model.fit(x_train_scaled, y_train)
# make predictions for test data
y_pred = model.predict(X_test_scaled)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Average precision and PR/ROC curves on the held-out test set.
# NOTE(review): these curves are computed from hard 0/1 predictions; using
# rgs.predict_proba(...)[:, 1] would yield proper, smoother curves.
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, rgs_pred)
print('Average precision-recall score: {0:0.2f}'.format(
average_precision))
from sklearn.metrics import precision_recall_curve
from inspect import signature
plt.rcParams['figure.figsize'] = [10, 6]
precision, recall, _ = precision_recall_curve(y_test, rgs_pred)
# Only pass step='post' when this matplotlib version's fill_between supports it.
step_kwargs = ({'step': 'post'}
if 'step' in signature(plt.fill_between).parameters
else {})
plt.step(recall, precision, color='b', alpha=0.2,where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall curve: AP={0:0.2f}'.format(average_precision), fontsize=12)
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# calculate AUC
auc = roc_auc_score(y_test, rgs_pred)
print('AUC: %.3f' % auc)
# calculate roc curve
fpr, tpr, thresholds = roc_curve(y_test, rgs_pred)
# plot no skill (diagonal chance line)
plt.rcParams['figure.figsize'] = [10, 6]
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
from sklearn.metrics import confusion_matrix
import itertools
#Evaluation of Model - Confusion Matrix Plot
def plot_confusion_matrix(cm, classes, title ='Confusion matrix', normalize = False, cmap = plt.cm.Blues):
    """Render a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm : ndarray of shape (n_classes, n_classes)
        Raw confusion matrix (counts), e.g. from sklearn's confusion_matrix.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    title : str
        Plot title.
    normalize : bool
        If True, convert each row to proportions before plotting.
        Bug fix: previously this flag only switched the number format to
        '.2f' but never actually normalized the matrix.
    cmap : matplotlib colormap
    """
    if normalize:
        # Row-normalise so each true-label row sums to 1.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Confusion matrix')
    print(cm)
    plt.style.use('fivethirtyeight')
    fig = plt.figure(figsize=(10,6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=40)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # Annotate every cell; flip the text colour on dark cells for contrast.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, rgs_pred)
np.set_printoptions(precision=2)
# Plot confusion matrix.
# Bug fix: row/column 0 of the matrix corresponds to label 0 (fraud_reported
# was encoded N -> 0, Y -> 1), so the class names must be listed N-first;
# the original ['Fraud_Y', 'Fraud_N'] mislabelled both axes.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Fraud_N', 'Fraud_Y'],
                      title='Confusion matrix')
# Low-variance and high-correlation feature screening.
from sklearn.feature_selection import VarianceThreshold
# NOTE(review): 0.057 looks hand-tuned for this dataset — confirm the rationale.
constant_filter = VarianceThreshold(threshold=0.057)
constant_filter.fit(X_train_scaled)
constant_columns = [column for column in X_train_scaled.columns
if column not in X_train_scaled.columns[constant_filter.get_support()]]
print(len(constant_columns))
# Collect one column out of every pair with |correlation| > 0.8.
correlated_features = set()
correlation_matrix = X_train_scaled.corr()
for i in range(len(correlation_matrix .columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
len(correlated_features)
print(correlated_features)
X.head(1)
# Drop the correlated / redundant columns identified above.
x = X.drop([
'vehicle_claim',
'injury_claim',
'age',
'csl_per_accident_500',
'csl_per_accident_1000',
'auto_model_Wrangler',
'insured_sex_MALE',
'csl_per_accident_300',
'property_claim',
'number_of_vehicles_involved'], axis=1)
x.head(1)
# Re-split and re-scale on the reduced feature set.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=7)
print('length of X_train and X_test: ', len(x_train), len(x_test))
print('length of y_train and y_test: ', len(y_train), len(y_test))
a_train_scaled = scaler.fit_transform(x_train)
a_test_scaled = scaler.transform(x_test)
# Re-evaluate LR and XGB on the reduced, re-scaled feature set.
xgb = XGBClassifier()
logreg= LogisticRegressionCV(solver='lbfgs', cv=10)
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegressionCV(solver='lbfgs', max_iter=5000, cv=10)))
models.append(('XGB', XGBClassifier()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    # Bug fix: shuffle=True is required when random_state is set; modern
    # scikit-learn rejects a seeded, unshuffled KFold.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, a_train_scaled, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)